{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from collections import defaultdict\n",
    "import random\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = malaya.preprocessing.TOKENIZER().tokenize\n",
    "sastrawi = malaya.stem.sastrawi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/dictionary/synonym/synonym0.json\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/dictionary/synonym/synonym1.json\n",
    "files = ['synonym0.json', 'synonym1.json']\n",
    "synonyms = defaultdict(list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in files:\n",
    "    with open(file) as fopen:\n",
    "        data = json.load(fopen)\n",
    "    for i in data:\n",
    "        if not len(i[1]):\n",
    "            continue\n",
    "        synonyms[i[0]].extend(i[1])\n",
    "        for r in i[1]:\n",
    "            synonyms[r].append(i[0])\n",
    "            \n",
    "for k, v in synonyms.items():\n",
    "    synonyms[k] = list(set(v))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace(string, threshold, synonym):\n",
    "    for no, word in enumerate(string):\n",
    "        if word in synonym_dict and random.random() > threshold:\n",
    "            w = random.choice(synonym[word])\n",
    "            string[no] = w\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import re\n",
    "from malaya.text.regex import _expressions\n",
    "\n",
    "def reset_t(tokens):\n",
    "    t = []\n",
    "    for i in range(len(tokens)):\n",
    "        t.append([tokens[i], 2])\n",
    "    return t\n",
    "\n",
    "def augment_17_0(t, row, threshold = 0.5):\n",
    "    text, tokens, tokens_lower = row\n",
    "    for i in range(len(tokens)):\n",
    "        if tokens_lower[i] in synonyms and random.random() > threshold:\n",
    "            w = random.choice(synonyms[tokens_lower[i]])\n",
    "            t[i][0] = w\n",
    "            t[i][1] = 17"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4/4 [00:00<00:00, 3877.33it/s]\n"
     ]
    }
   ],
   "source": [
    "results = []\n",
    "for text in tqdm(['Cerpen itu telah saya karang.', \n",
    "                  'Latihan itu mesti kau buat.',\n",
    "                  'Kereta itu saya beli daripada Ali.',\n",
    "                  'Dia memberi saya sebatang pena.']):\n",
    "    tokens = tokenizer(text)\n",
    "    t = reset_t(tokens)\n",
    "    t_ = copy.deepcopy(t)\n",
    "    tokens_lower = tokenizer(text.lower())\n",
    "    r = (t, tokens, tokens_lower)\n",
    "    augment_17_0(t_, r)\n",
    "    results.append((t, t_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[([['Cerpen', 2],\n",
       "   ['itu', 2],\n",
       "   ['telah', 2],\n",
       "   ['saya', 2],\n",
       "   ['karang', 2],\n",
       "   ['.', 2]],\n",
       "  [['Cerpen', 2],\n",
       "   ['itu', 2],\n",
       "   ['pernah', 17],\n",
       "   ['saya', 2],\n",
       "   ['karang', 2],\n",
       "   ['.', 2]]),\n",
       " ([['Latihan', 2],\n",
       "   ['itu', 2],\n",
       "   ['mesti', 2],\n",
       "   ['kau', 2],\n",
       "   ['buat', 2],\n",
       "   ['.', 2]],\n",
       "  [['senaman', 17],\n",
       "   ['itu', 2],\n",
       "   ['mesti', 2],\n",
       "   ['kau', 2],\n",
       "   ['mengatur', 17],\n",
       "   ['.', 2]]),\n",
       " ([['Kereta', 2],\n",
       "   ['itu', 2],\n",
       "   ['saya', 2],\n",
       "   ['beli', 2],\n",
       "   ['daripada', 2],\n",
       "   ['Ali', 2],\n",
       "   ['.', 2]],\n",
       "  [['Kereta', 2],\n",
       "   ['itu', 2],\n",
       "   ['saya', 2],\n",
       "   ['memperoleh', 17],\n",
       "   ['daripada', 2],\n",
       "   ['Ali', 2],\n",
       "   ['.', 2]]),\n",
       " ([['Dia', 2],\n",
       "   ['memberi', 2],\n",
       "   ['saya', 2],\n",
       "   ['sebatang', 2],\n",
       "   ['pena', 2],\n",
       "   ['.', 2]],\n",
       "  [['Dia', 2],\n",
       "   ['menampung', 17],\n",
       "   ['saya', 2],\n",
       "   ['sebatang', 2],\n",
       "   ['pena', 2],\n",
       "   ['.', 2]])]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
